# Author: Stephen Situ
# Logistic Regression models the probability of an event taking place by having the log odds be a linear combination
# of the features. They are fitted into a sigmoid function sigma(x) = (1/(1+exp(-x)) where the output can range from 0 to 1.
# This is useful for predicting binary outcomes by defining thresholds (usually 0-0.499 & 0.500-1).
# We take a sample of breast cancer data that is diagnosed as benign ("B") or malignant ("M") (cancer) and train
# a logistic regression model.
# Original dataset: https://www.kaggle.com/datasets/vijayaadithyanvg/breast-cancer-prediction
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# read CSV
breast_cancer_data = pd.read_csv("data.csv")
# Head
breast_cancer_data.head
<bound method NDFrame.head of id diagnosis Radius_mean Texture_mean perimeter_mean area_mean \ 0 842302 M 17.99 10.38 122.80 1001.0 1 842517 M 20.57 21.77 132.90 1326.0 2 84300903 M 19.69 21.25 130.00 1203.0 3 84348301 M 11.42 20.38 77.58 386.1 4 84358402 M 20.29 14.34 135.10 1297.0 .. ... ... ... ... ... ... 564 926424 M 21.56 22.39 142.00 1479.0 565 926682 M 20.13 28.25 131.20 1261.0 566 926954 M 16.60 28.08 108.30 858.1 567 927241 M 20.60 29.33 140.10 1265.0 568 92751 B 7.76 24.54 47.92 181.0 smoothness_mean compactness_mean concavity_mean concave points_mean \ 0 0.11840 0.27760 0.30010 0.14710 1 0.08474 0.07864 0.08690 0.07017 2 0.10960 0.15990 0.19740 0.12790 3 0.14250 0.28390 0.24140 0.10520 4 0.10030 0.13280 0.19800 0.10430 .. ... ... ... ... 564 0.11100 0.11590 0.24390 0.13890 565 0.09780 0.10340 0.14400 0.09791 566 0.08455 0.10230 0.09251 0.05302 567 0.11780 0.27700 0.35140 0.15200 568 0.05263 0.04362 0.00000 0.00000 ... radius_worst texture_worst perimeter_worst area_worst \ 0 ... 25.380 17.33 184.60 2019.0 1 ... 24.990 23.41 158.80 1956.0 2 ... 23.570 25.53 152.50 1709.0 3 ... 14.910 26.50 98.87 567.7 4 ... 22.540 16.67 152.20 1575.0 .. ... ... ... ... ... 564 ... 25.450 26.40 166.10 2027.0 565 ... 23.690 38.25 155.00 1731.0 566 ... 18.980 34.12 126.70 1124.0 567 ... 25.740 39.42 184.60 1821.0 568 ... 9.456 30.37 59.16 268.6 smoothness_worst compactness_worst concavity_worst \ 0 0.16220 0.66560 0.7119 1 0.12380 0.18660 0.2416 2 0.14440 0.42450 0.4504 3 0.20980 0.86630 0.6869 4 0.13740 0.20500 0.4000 .. ... ... ... 564 0.14100 0.21130 0.4107 565 0.11660 0.19220 0.3215 566 0.11390 0.30940 0.3403 567 0.16500 0.86810 0.9387 568 0.08996 0.06444 0.0000 concave points_worst symmetry_worst fractal_dimension_worst 0 0.2654 0.4601 0.11890 1 0.1860 0.2750 0.08902 2 0.2430 0.3613 0.08758 3 0.2575 0.6638 0.17300 4 0.1625 0.2364 0.07678 .. ... ... ... 564 0.2216 0.2060 0.07115 565 0.1628 0.2572 0.06637 566 0.1418 0.2218 0.07820 567 0.2650 0.4087 0.12400 568 0.0000 0.2871 0.07039 [569 rows x 32 columns]>
# describe
breast_cancer_data.describe
<bound method NDFrame.describe of id diagnosis Radius_mean Texture_mean perimeter_mean area_mean \ 0 842302 M 17.99 10.38 122.80 1001.0 1 842517 M 20.57 21.77 132.90 1326.0 2 84300903 M 19.69 21.25 130.00 1203.0 3 84348301 M 11.42 20.38 77.58 386.1 4 84358402 M 20.29 14.34 135.10 1297.0 .. ... ... ... ... ... ... 564 926424 M 21.56 22.39 142.00 1479.0 565 926682 M 20.13 28.25 131.20 1261.0 566 926954 M 16.60 28.08 108.30 858.1 567 927241 M 20.60 29.33 140.10 1265.0 568 92751 B 7.76 24.54 47.92 181.0 smoothness_mean compactness_mean concavity_mean concave points_mean \ 0 0.11840 0.27760 0.30010 0.14710 1 0.08474 0.07864 0.08690 0.07017 2 0.10960 0.15990 0.19740 0.12790 3 0.14250 0.28390 0.24140 0.10520 4 0.10030 0.13280 0.19800 0.10430 .. ... ... ... ... 564 0.11100 0.11590 0.24390 0.13890 565 0.09780 0.10340 0.14400 0.09791 566 0.08455 0.10230 0.09251 0.05302 567 0.11780 0.27700 0.35140 0.15200 568 0.05263 0.04362 0.00000 0.00000 ... radius_worst texture_worst perimeter_worst area_worst \ 0 ... 25.380 17.33 184.60 2019.0 1 ... 24.990 23.41 158.80 1956.0 2 ... 23.570 25.53 152.50 1709.0 3 ... 14.910 26.50 98.87 567.7 4 ... 22.540 16.67 152.20 1575.0 .. ... ... ... ... ... 564 ... 25.450 26.40 166.10 2027.0 565 ... 23.690 38.25 155.00 1731.0 566 ... 18.980 34.12 126.70 1124.0 567 ... 25.740 39.42 184.60 1821.0 568 ... 9.456 30.37 59.16 268.6 smoothness_worst compactness_worst concavity_worst \ 0 0.16220 0.66560 0.7119 1 0.12380 0.18660 0.2416 2 0.14440 0.42450 0.4504 3 0.20980 0.86630 0.6869 4 0.13740 0.20500 0.4000 .. ... ... ... 564 0.14100 0.21130 0.4107 565 0.11660 0.19220 0.3215 566 0.11390 0.30940 0.3403 567 0.16500 0.86810 0.9387 568 0.08996 0.06444 0.0000 concave points_worst symmetry_worst fractal_dimension_worst 0 0.2654 0.4601 0.11890 1 0.1860 0.2750 0.08902 2 0.2430 0.3613 0.08758 3 0.2575 0.6638 0.17300 4 0.1625 0.2364 0.07678 .. ... ... ... 564 0.2216 0.2060 0.07115 565 0.1628 0.2572 0.06637 566 0.1418 0.2218 0.07820 567 0.2650 0.4087 0.12400 568 0.0000 0.2871 0.07039 [569 rows x 32 columns]>
# dtypes
breast_cancer_data.dtypes
id int64 diagnosis object Radius_mean float64 Texture_mean float64 perimeter_mean float64 area_mean float64 smoothness_mean float64 compactness_mean float64 concavity_mean float64 concave points_mean float64 symmetry_mean float64 fractal_dimension_mean float64 radius_se float64 texture_se float64 perimeter_se float64 area_se float64 smoothness_se float64 compactness_se float64 concavity_se float64 concave points_se float64 symmetry_se float64 fractal_dimension_se float64 radius_worst float64 texture_worst float64 perimeter_worst float64 area_worst float64 smoothness_worst float64 compactness_worst float64 concavity_worst float64 concave points_worst float64 symmetry_worst float64 fractal_dimension_worst float64 dtype: object
# Change id to category
breast_cancer_data["id"] = breast_cancer_data["id"].astype("category")
# Quick Scatterplot Visualization
ax = sns.scatterplot(x='Radius_mean', y='perimeter_mean', data=breast_cancer_data, hue='diagnosis')
ax.set(xlabel='Mean Radius', ylabel='Mean Perimeter', title='Scatterplot of Breast Cancer Diagnosis')
[Text(0.5, 0, 'Mean Radius'), Text(0, 0.5, 'Mean Perimeter'), Text(0.5, 1.0, 'Scatterplot of Breast Cancer Diagnosis')]
# Hot encode categorical variable
bcd = pd.get_dummies(breast_cancer_data.drop(['id'],axis=1))
bcd1 = bcd.drop(['diagnosis_B'],axis=1)
bcd1
Radius_mean | Texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diagnosis_M | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | ... | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 1 |
1 | 20.57 | 21.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | ... | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 1 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | ... | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 1 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | ... | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 1 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | ... | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | ... | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 | 1 |
565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | ... | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 | 1 |
566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | ... | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 | 1 |
567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | ... | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 | 1 |
568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | ... | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 | 0 |
569 rows × 31 columns
# Do train test split using 80/20 split and x & y
train_data, test_data = train_test_split(bcd1, test_size=0.2)
train_data_y = train_data['diagnosis_M']
train_data_x = train_data.drop(columns=['diagnosis_M'])
test_data_y = test_data['diagnosis_M']
test_data_x = test_data.drop(columns=['diagnosis_M'])
# Preform Logistic Regression
log_reg = LogisticRegression(max_iter=3000)
log_reg.fit(train_data_x,train_data_y)
LogisticRegression(max_iter=3000)
# Predict on test data
y_pred = log_reg.predict(test_data_x)
# Confusion Matrix gives us 95.6% accuracy
confusion_matrix(y_pred,test_data_y)
array([[68, 3], [ 2, 41]], dtype=int64)
# Create new dataframe to visualize accuracy
bcd2 = test_data
bcd2["diagnosis_pred"] = y_pred
bcd2.loc[bcd2['diagnosis_M'] == bcd2['diagnosis_pred'], 'Accuracy'] = 'Correct'
bcd2.loc[bcd2['diagnosis_M'] != bcd2['diagnosis_pred'], 'Accuracy'] = 'Incorrect'
bcd2.loc[bcd2['diagnosis_M'] == 1, 'Diagnosis_true'] = 'M'
bcd2.loc[bcd2['diagnosis_M'] != 1, 'Diagnosis_true'] = 'B'
# Cast columns as categorical
bcd2["Accuracy"] = bcd2["Accuracy"].astype("category")
bcd2["Diagnosis_true"] = bcd2["Diagnosis_true"].astype("category")
# Quick view
bcd2
Radius_mean | Texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diagnosis_M | diagnosis_pred | Accuracy | Diagnosis_true | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
166 | 10.80 | 9.71 | 68.77 | 357.6 | 0.09594 | 0.05736 | 0.025310 | 0.016980 | 0.1381 | 0.06400 | ... | 0.14360 | 0.12570 | 0.10470 | 0.04603 | 0.2090 | 0.07699 | 0 | 0 | Correct | B |
482 | 13.47 | 14.06 | 87.32 | 546.3 | 0.10710 | 0.11550 | 0.057860 | 0.052660 | 0.1779 | 0.06639 | ... | 0.13930 | 0.24990 | 0.18480 | 0.13350 | 0.3227 | 0.09326 | 0 | 0 | Correct | B |
212 | 28.11 | 18.47 | 188.50 | 2499.0 | 0.11420 | 0.15160 | 0.320100 | 0.159500 | 0.1648 | 0.05525 | ... | 0.11420 | 0.15160 | 0.32010 | 0.15950 | 0.1648 | 0.05525 | 1 | 1 | Correct | M |
562 | 15.22 | 30.62 | 103.40 | 716.9 | 0.10480 | 0.20870 | 0.255000 | 0.094290 | 0.2128 | 0.07152 | ... | 0.14170 | 0.79170 | 1.17000 | 0.23560 | 0.4089 | 0.14090 | 1 | 1 | Correct | M |
510 | 11.74 | 14.69 | 76.31 | 426.0 | 0.08099 | 0.09661 | 0.067260 | 0.026390 | 0.1499 | 0.06758 | ... | 0.10730 | 0.27930 | 0.26900 | 0.10560 | 0.2604 | 0.09879 | 0 | 0 | Correct | B |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
157 | 16.84 | 19.46 | 108.40 | 880.2 | 0.07445 | 0.07223 | 0.051500 | 0.027710 | 0.1844 | 0.05268 | ... | 0.08774 | 0.17100 | 0.18820 | 0.08436 | 0.2527 | 0.05972 | 0 | 1 | Incorrect | B |
296 | 10.91 | 12.35 | 69.14 | 363.7 | 0.08518 | 0.04721 | 0.012360 | 0.013690 | 0.1449 | 0.06031 | ... | 0.09312 | 0.07506 | 0.02884 | 0.03194 | 0.2143 | 0.06643 | 0 | 0 | Correct | B |
396 | 13.51 | 18.89 | 88.10 | 558.1 | 0.10590 | 0.11470 | 0.085800 | 0.053810 | 0.1806 | 0.06079 | ... | 0.14280 | 0.25700 | 0.34380 | 0.14530 | 0.2666 | 0.07686 | 0 | 0 | Correct | B |
334 | 12.30 | 19.02 | 77.88 | 464.4 | 0.08313 | 0.04202 | 0.007756 | 0.008535 | 0.1539 | 0.05945 | ... | 0.12220 | 0.09052 | 0.03619 | 0.03983 | 0.2554 | 0.07207 | 0 | 0 | Correct | B |
495 | 14.87 | 20.21 | 96.12 | 680.9 | 0.09587 | 0.08345 | 0.068240 | 0.049510 | 0.1487 | 0.05748 | ... | 0.12160 | 0.13880 | 0.17000 | 0.10170 | 0.2369 | 0.06599 | 0 | 0 | Correct | B |
114 rows × 34 columns
# Create scatter plot to visualize result
gx = sns.scatterplot(x='Radius_mean', y='perimeter_mean', data=bcd2, hue='Accuracy',style="Diagnosis_true")
gx.set(xlabel='Mean Radius', ylabel='Mean Perimeter', title='Scatterplot of Breast Cancer Logistic Regression Prediction On Test Data')
[Text(0.5, 0, 'Mean Radius'), Text(0, 0.5, 'Mean Perimeter'), Text(0.5, 1.0, 'Scatterplot of Breast Cancer Logistic Regression Prediction On Test Data')]
# Other Parameters
print('Intercept is', log_reg.intercept_)
print('Coefficients are', log_reg.coef_)
Intercept is [-23.06487791] Coefficients are [[-0.89510767 -0.19950075 0.19813797 -0.01709282 0.15074503 0.18661899 0.48322892 0.24736118 0.25028976 0.02861489 0.05131253 -0.94746801 0.1976007 0.08442126 0.02079837 -0.03942007 0.06354866 0.03564237 0.03529912 -0.01156539 -0.48316096 0.415974 0.12568589 0.01491001 0.29112951 0.69990637 1.48231777 0.50418023 0.67023358 0.09829928]]